Summary & Descriptive Statistics
# Number of cells in HNA and LNA across all samples
# For a plot of these values, see figure 1 below
df_cells %>%
group_by(FCM_type) %>%
summarise(num_samples = n(),
mean_cells = round(mean(num_cells), digits = 2),
sd_mean_cells = round(sd(num_cells), digits = 2),
median_cells = round(median(num_cells), digits = 2)) %>%
datatable(caption = "Mean and median number of cells per ecosystem", rownames = FALSE)
# Are there more total cells in one lake over the other?
totcells_df <- df_cells %>%
filter(FCM_type == "Total")
# Compute the analysis of variance
totcells_aov <- aov(num_cells ~ Lake, data = totcells_df)
summary(totcells_aov) # there is a difference, but which lake?
## Df Sum Sq Mean Sq F value Pr(>F)
## Lake 2 4.332e+14 2.166e+14 37.75 2.72e-14 ***
## Residuals 170 9.755e+14 5.738e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Which samples are significant from each other?
TukeyHSD(totcells_aov) # Michigan is significantly different form Muskegon and Inland
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = num_cells ~ Lake, data = totcells_df)
##
## $Lake
## diff lwr upr p adj
## Inland-Michigan 3417194.2 2411737 4422651.4 0.0000000
## Muskegon-Michigan 3030294.1 1939004 4121584.0 0.0000000
## Muskegon-Inland -386900.1 -1489077 715276.9 0.6849527
# Number of cells in HNA and LNA per ecosystem
df_cells %>%
group_by(Lake, FCM_type) %>%
summarise(num_samples = n(),
mean_cells = round(mean(num_cells), digits = 2),
median_cells = round(median(num_cells), digits = 2)) %>%
datatable(caption = "Mean and median number of cells per ecosystem", rownames = FALSE)
# Proportion of HNA and LNA across all samples
df_cells %>%
dplyr::select(samples, Lake, FCM_type, num_cells) %>%
spread(FCM_type, num_cells) %>%
mutate(prop_HNA = HNA/Total * 100,
prop_LNA = LNA/Total * 100) %>%
dplyr::select(Lake, prop_HNA, prop_LNA) %>%
summarize(mean_HNA = round(mean(prop_HNA), digits = 2),
sd_HNA = round(sd(prop_HNA), digits = 2),
mean_LNA = round(mean(prop_LNA), digits = 2),
sd_LNA = round(sd(prop_LNA), digits = 2))
## mean_HNA sd_HNA mean_LNA sd_LNA
## 1 30.41 9.06 69.59 9.06
# Proportion of HNA and LNAper ecosystem
prop_stats <- df_cells %>%
dplyr::select(samples, Lake, FCM_type, num_cells) %>%
spread(FCM_type, num_cells) %>%
mutate(prop_HNA = HNA/Total * 100,
prop_LNA = LNA/Total * 100) %>%
dplyr::select(Lake, prop_HNA, prop_LNA) %>%
rename(HNA = prop_HNA, LNA = prop_LNA) %>%
group_by(Lake) %>%
summarize(mean_HNA = round(mean(HNA), digits = 2),
mean_LNA = round(mean(LNA), digits = 2),
min_HNA = round(min(HNA), digits = 2),
max_HNA = round(max(HNA), digits = 2),
min_LNA = round(min(LNA), digits = 2),
max_LNA = round(max(LNA), digits = 2),
num_samples = n())
datatable(prop_stats, caption = "Statistics of the percentage of each flow cytometry group across the systems", rownames = FALSE)
Number of cells per system
# Load in the data from each lake
musk_all_df <- read.table(file="data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_sampledata_5in10.tsv", header = TRUE) %>%
mutate(norep_filter_name = paste(substr(Sample_16S,1,4), substr(Sample_16S,6,9), sep = "")) %>%
arrange(norep_filter_name)
mich_all_df <- read.table(file="data/Chloroplasts_removed/ByLake_Filtering/5in10/michigan/michigan_sampledata_5in10.tsv", header = TRUE) %>%
mutate(norep_filter_name = paste(substr(Sample_16S,1,4), substr(Sample_16S,6,9), sep = "")) %>%
arrange(norep_filter_name)
inla_all_df <- read.table(file="data/Chloroplasts_removed/ByLake_Filtering/5in10/inland/inland_sampledata_5in10.tsv", header = TRUE) %>%
mutate(norep_filter_name = paste(substr(Sample_16S,1,4), substr(Sample_16S,6,9), sep = "")) %>%
arrange(norep_filter_name)
stopifnot(colnames(musk_all_df) == colnames(mich_all_df))
stopifnot(colnames(musk_all_df) == colnames(inla_all_df))
lakes <- bind_rows(musk_all_df, mich_all_df, inla_all_df)
ggplot(df_cells, aes(x = FCM_type, y = num_cells, fill = Lake, color = Lake, shape = Lake)) +
geom_point(size = 1.5, position = position_jitterdodge(), color = "black") +
geom_boxplot(alpha = 0.7, outlier.shape = NA, show.legend = FALSE, color = "black") +
scale_color_manual(values = lake_colors, guide = "none") +
scale_fill_manual(values = lake_colors) +
scale_shape_manual(values = lake_shapes) +
labs(y = "Number of Cells (cells/mL)", x = "FCM Type") +
mytheme + theme(legend.title = element_blank(), legend.position = c(0.01, 0.95)) +
guides(colour = guide_legend(override.aes = list(size=2.5)),
shape = guide_legend(override.aes = list(size=2.5)),
fill = guide_legend(override.aes = list(size=2.5)))

Figure 1
######################## Analysis of HNA/LNA/Total Cells vs Total Productivity
# 1. Run the linear model
lm_HNA <- lm(tot_bacprod ~ HNA.cells, data = muskegon)
## 2. Extract the R2 and p-value from the linear model:
lm_HNA_stats <- paste("atop(R^2 ==", round(summary(lm_HNA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_HNA)$coefficients[,4][2]), digits = 5), ")")
# 3. Plot it
HNA_vs_prod <- ggplot(muskegon, aes(x = HNA.cells, y = tot_bacprod)) +
geom_errorbarh(aes(xmin = HNA.cells - HNA.sd, xmax = HNA.cells + HNA.sd), color = "grey", alpha = 0.8) +
geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, max = tot_bacprod + SD_tot_bacprod), color = "grey", alpha = 0.8) +
geom_point(size = 2, shape = 22, fill = "deepskyblue4") +
geom_smooth(method = "lm", color = "deepskyblue4") +
labs(y = "Bacterial Production \n (μg C/L/day)", x = "HNA Cell Density \n(cells/mL)") +
scale_x_continuous(labels = function(x) format(x, scientific = TRUE),
breaks = c(2e+06, 3e+06)) +
annotate("text", x=1.65e+06, y=60, label=lm_HNA_stats, parse = TRUE, color = "black", size = 3) +
mytheme
# 1. Run the linear model
lm_LNA <- lm(tot_bacprod ~ LNA.cells, data = muskegon)
## 2. Extract the R2 and p-value from the linear model:
lm_LNA_stats <- paste("atop(R^2 ==", round(summary(lm_LNA)$adj.r.squared, digits = 3), ",",
"p ==", round(unname(summary(lm_LNA)$coefficients[,4][2]), digits = 2), ")")
# 3. Plot it
LNA_vs_prod <- ggplot(muskegon, aes(x = LNA.cells, y = tot_bacprod)) +
geom_errorbarh(aes(xmin = LNA.cells - LNA.sd, xmax = LNA.cells + LNA.sd), color = "grey", alpha = 0.8) +
geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, max = tot_bacprod + SD_tot_bacprod), color = "grey", alpha = 0.8) +
geom_point(size = 2.5, shape = 22, fill = "darkgoldenrod1") +
labs(y = "Bacterial Production \n (μg C/L/day)", x = "LNA Cell Density \n(cells/mL)") +
geom_smooth(method = "lm", se = FALSE, linetype = "longdash", color = "darkgoldenrod1") +
annotate("text", x=2.75e+06, y=60, label=lm_LNA_stats, parse = TRUE, color = "red", size = 3) +
mytheme
# 1. Run the linear model
lm_total <- lm(tot_bacprod ~ Total.cells, data = muskegon)
## 2. Extract the R2 and p-value from the linear model:
lm_total_stats <- paste("atop(R^2 ==", round(summary(lm_total)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_total)$coefficients[,4][2]), digits = 2), ")")
# 3. Plot it
Total_vs_prod <- ggplot(muskegon, aes(x = Total.cells, y = tot_bacprod)) +
geom_errorbarh(aes(xmin = Total.cells - Total.count.sd, xmax = Total.cells + Total.count.sd), color = "grey", alpha = 0.8) +
geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, max = tot_bacprod + SD_tot_bacprod), color = "grey", alpha = 0.8) +
scale_shape_manual(values = lake_shapes) +
geom_point(size = 2.5, shape = 22, fill = "black") +
labs(y = "Bacterial Production \n (μg C/L/day)", x = "Total Cell Density \n(cells/mL)") +
geom_smooth(method = "lm", color = "black") +
#geom_smooth(method = "lm", se = FALSE, linetype = "longdash", color = "red") +
annotate("text", x=5.25e+06, y=60, label=lm_total_stats, parse = TRUE, color = "black", size = 3) +
mytheme
###### ###### ###### ###### ###### ###### ###### ###### ###### ######
###### Correlation between HNA and LNA Across the three systems ######
## Is there a corrlation between HNA and LNA across ecosystems?
# 1. Run the linear model
lm_allNA_corr <- lm(LNA.cells ~ HNA.cells, data = lakes)
summary(lm(LNA.cells ~ HNA.cells * Lake, data = lakes))
##
## Call:
## lm(formula = LNA.cells ~ HNA.cells * Lake, data = lakes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3651006 -558525 -113725 511977 3997810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.699e+06 3.590e+05 10.304 < 2e-16 ***
## HNA.cells 3.827e-01 1.704e-01 2.246 0.026 *
## LakeMichigan -2.995e+06 4.523e+05 -6.622 4.63e-10 ***
## LakeMuskegon -2.363e+06 5.411e+05 -4.367 2.21e-05 ***
## HNA.cells:LakeMichigan 5.404e-01 4.257e-01 1.269 0.206
## HNA.cells:LakeMuskegon 1.027e+00 2.549e-01 4.029 8.50e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1301000 on 167 degrees of freedom
## Multiple R-squared: 0.6116, Adjusted R-squared: 0.6
## F-statistic: 52.6 on 5 and 167 DF, p-value: < 2.2e-16
## 2. Extract the R2 and p-value from the linear model:
lm_allNA_corr_stats <- paste("atop(R^2 ==", round(summary(lm_allNA_corr)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_allNA_corr)$coefficients[,4][2]), digits = 24), ")")
# 3. Plot it
p2 <- ggplot(lakes, aes(x = HNA.cells, y = LNA.cells)) +
geom_errorbarh(aes(xmin = HNA.cells - HNA.sd, xmax = HNA.cells + HNA.sd), color = "grey", alpha = 0.8) +
geom_errorbar(aes(ymin = LNA.cells - LNA.sd, max = LNA.cells + LNA.sd), color = "grey", alpha = 0.8) +
geom_point(size = 2.5, alpha = 0.9, aes(fill = Lake, shape = Lake)) +
scale_fill_manual(values = lake_colors) +
scale_shape_manual(values = lake_shapes) +
geom_smooth(method = "lm", color = "black") +
labs(y = "LNA Cell Density (cells/mL)", x = "HNA Cell Density (cells/mL)") +
scale_x_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 6.1e+06)) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 1e+07)) +
annotate("text", x=5e+06, y=0.8e+06, label=lm_allNA_corr_stats, parse = TRUE, color = "black", size = 3) +
mytheme + theme(legend.position = "none") #theme(legend.title = element_blank(), legend.position = c(0.01, 0.95))
###### MAKE THE PLOT
## Extract legends for plotting
plot_for_legend <-
ggplot(df_cells, aes(x = Lake, y = num_cells, fill = FCM_type, color = FCM_type)) +
geom_point(size = 1, shape = 22, position = position_jitterdodge()) +
scale_fill_manual(values = fcm_colors) +
scale_color_manual(values = fcm_colors, guide = "none") +
scale_shape_manual(values = 22) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
labs(y = "Number of Cells \n (cells/mL)", x = "Lake") +
theme(legend.position = "right",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12),
legend.key.width=unit(1,"line"),
legend.key.height=unit(1,"line")) +
guides(fill = guide_legend(override.aes = list(size=3.5)))
legend1 <- get_legend(plot_for_legend)
legend2 <- get_legend(p2 + theme(legend.position = "right",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12),
legend.key.width=unit(1,"line"),
legend.key.height=unit(1,"line")) +
guides(fill = guide_legend(override.aes = list(size=3.5))))
# place the legends
legend_positions <- plot_grid(legend1, legend2, nrow = 2, ncol = 1)
# Place the legends next to figure 1A
row1 <- plot_grid(NULL, p2, legend_positions,
labels = c("", "A", ""),
ncol = 3, nrow = 1,
rel_widths = c(0.5, 1, 0.5))
# Add Figures 1B, 1C, and 1D
row2 <- plot_grid(HNA_vs_prod + theme(legend.position = "none"),
LNA_vs_prod + theme(axis.title.y = element_blank(), legend.position = "none"),
Total_vs_prod + theme(axis.title.y = element_blank(), legend.position = "none"),
labels = c("B", "C", "D"),
ncol = 3, nrow = 1,
rel_widths = c(0.95, 0.8, 0.8))
# Final Figure 1
plot_grid(row1, row2,
nrow = 2, ncol = 1)

Figure 1A: Plotted By Lake System
### MUSKEGON ONLY ANALYSIS
# 1. Run the linear model
lm_muskNA_corr <- lm(LNA.cells ~ HNA.cells, data = musk_all_df)
## 2. Extract the R2 and p-value from the linear model:
lm_muskNA_corr_stats <- paste("atop(R^2 ==", round(summary(lm_muskNA_corr)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_muskNA_corr)$coefficients[,4][2]), digits = 9), ")")
musk_corr_plot <- ggplot(musk_all_df, aes(x = HNA.cells, y = LNA.cells)) +
geom_errorbarh(aes(xmin = HNA.cells - HNA.sd, xmax = HNA.cells + HNA.sd), color = "grey") +
geom_errorbar(aes(ymin = LNA.cells - LNA.sd, max = LNA.cells + LNA.sd), color = "grey") +
geom_point(size = 3, shape = 22, aes(fill = Lake)) +
geom_smooth(method = "lm", color = "black") +
ggtitle("Muskegon Lake") + scale_fill_manual(values = lake_colors) +
scale_x_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 6.1e+06)) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 1e+07)) +
labs(y = "LNA Cell Density (cells/mL)", x = "HNA Cell Density (cells/mL)") +
annotate("text", x=5e+06, y=0.8e+06, label=lm_muskNA_corr_stats, parse = TRUE, color = "black", size = 3) +
mytheme
### INLAND ONLY ANALYSIS
# 1. Run the linear model
lm_inlaNA_corr <- lm(LNA.cells ~ HNA.cells, data = filter(inla_all_df, Sample_16S != "Z14003F"))
## 2. Extract the R2 and p-value from the linear model:
lm_inlaNA_corr_stats <- paste("atop(R^2 ==", round(summary(lm_inlaNA_corr)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_inlaNA_corr)$coefficients[,4][2]), digits = 2), ")")
inla_corr_plot <- ggplot(filter(inla_all_df, Sample_16S != "Z14003F"), aes(x = HNA.cells, y = LNA.cells)) +
geom_errorbarh(aes(xmin = HNA.cells - HNA.sd, xmax = HNA.cells + HNA.sd), color = "grey") +
geom_errorbar(aes(ymin = LNA.cells - LNA.sd, max = LNA.cells + LNA.sd), color = "grey") +
geom_point(size = 3, shape = 22, aes(fill = Lake)) +
geom_smooth(method = "lm", color = "black") +
scale_x_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 6.1e+06)) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 1e+07)) +
ggtitle("Inland Lakes") + scale_fill_manual(values = lake_colors) +
labs(y = "LNA Cell Density (cells/mL)", x = "HNA Cell Density (cells/mL)") +
annotate("text", x=5e+06, y=0.8e+06, label=lm_inlaNA_corr_stats, parse = TRUE, color = "black", size = 3) +
mytheme
### MICHIGAN ONLY ANALYSIS
# 1. Run the linear model
lm_michNA_corr <- lm(LNA.cells ~ HNA.cells, data = mich_all_df)
# Without the Lake Michigan outlier!
summary(lm(LNA.cells ~ HNA.cells, data = filter(mich_all_df, Sample_16S != "M15S2F515")))
##
## Call:
## lm(formula = LNA.cells ~ HNA.cells, data = filter(mich_all_df,
## Sample_16S != "M15S2F515"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -599787 -239620 -95094 241831 987737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.832e+05 9.793e+04 5.956 3.37e-07 ***
## HNA.cells 1.200e+00 1.797e-01 6.678 2.78e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 354600 on 46 degrees of freedom
## Multiple R-squared: 0.4922, Adjusted R-squared: 0.4812
## F-statistic: 44.59 on 1 and 46 DF, p-value: 2.778e-08
## 2. Extract the R2 and p-value from the linear model:
lm_michNA_corr_stats <- paste("atop(R^2 ==", round(summary(lm_michNA_corr)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_michNA_corr)$coefficients[,4][2]), digits = 11), ")")
mich_corr_plot <- ggplot(mich_all_df, aes(x = HNA.cells, y = LNA.cells)) +
geom_errorbarh(aes(xmin = HNA.cells - HNA.sd, xmax = HNA.cells + HNA.sd), color = "grey") +
geom_errorbar(aes(ymin = LNA.cells - LNA.sd, max = LNA.cells + LNA.sd), color = "grey") +
geom_point(size = 3, shape = 22, aes(fill = Lake)) +
geom_smooth(method = "lm", color = "black") +
scale_x_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 6.1e+06)) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE), limits = c(0, 1e+07)) +
ggtitle("Lake Michigan") + scale_fill_manual(values = lake_colors) +
labs(y = "LNA Cell Density (cells/mL)", x = "HNA Cell Density (cells/mL)") +
annotate("text", x=5e+06, y=0.8e+06, label=lm_michNA_corr_stats, parse = TRUE, color = "black", size = 3) +
mytheme
plot_grid(mich_corr_plot, inla_corr_plot, musk_corr_plot,
labels = c("A", "B", "C"), nrow = 1, ncol = 3)

Figure 1B & 1D: Fraction HNA vs Productivity
## Plot the fraction of HNA
fmusk <- muskegon %>%
mutate(fHNA = HNA.cells/Total.cells,
fLNA = LNA.cells/Total.cells)
lm_fHNA <- lm(tot_bacprod ~ fHNA, data = fmusk)
## 2. Extract the R2 and p-value from the linear model:
lm_fHNA_stats <- paste("atop(R^2 ==", round(summary(lm_fHNA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_fHNA)$coefficients[,4][2]), digits = 3), ")")
fHNA_vs_prod <- ggplot(fmusk, aes(x = fHNA, y = tot_bacprod)) +
geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, max = tot_bacprod + SD_tot_bacprod), color = "grey") +
geom_point(size = 3, aes(shape = Lake), fill = "deepskyblue4") +
geom_smooth(method = "lm", linetype = "longdash", color = "deepskyblue4") +
scale_x_continuous(limits = c(0.15, 0.9), breaks = seq(0.2, 0.9, by = 0.2)) +
ylab("Bacterial Production") + xlab("Fraction HNA") +
scale_shape_manual(values = lake_shapes) +
annotate("text", x= 0.22, y=60, label=lm_fHNA_stats, parse = TRUE, color = "black", size = 3) +
mytheme + theme(legend.position = "none")
### LNA Fraction
## Plot the fraction of HNA
lm_fLNA <- lm(tot_bacprod ~ fLNA, data = fmusk)
## 2. Extract the R2 and p-value from the linear model:
lm_fLNA_stats <- paste("atop(R^2 ==", round(summary(lm_fLNA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_fLNA)$coefficients[,4][2]), digits = 3), ")")
fLNA_vs_prod <- ggplot(fmusk, aes(x = fLNA, y = tot_bacprod)) +
geom_errorbar(aes(ymin = tot_bacprod - SD_tot_bacprod, max = tot_bacprod + SD_tot_bacprod), color = "grey") +
geom_point(size = 3, aes(shape = Lake), fill = "darkgoldenrod1") +
geom_smooth(method = "lm", color = "darkgoldenrod1", fill = "darkgoldenrod1", linetype = "longdash") +
scale_x_continuous(limits = c(0.15, 0.9), breaks = seq(0.2, 0.9, by = 0.2)) +
ylab("Bacterial Production") + xlab("Fraction LNA") +
scale_shape_manual(values = lake_shapes) +
annotate("text", x= 0.22, y=60, label=lm_fLNA_stats, parse = TRUE, color = "black", size = 3) +
mytheme + theme(legend.position = "none")
plot_grid(HNA_vs_prod + theme(legend.position = "none"),
fHNA_vs_prod,
LNA_vs_prod, fLNA_vs_prod,
labels = c("A", "B", "C", "D"), ncol = 2, nrow = 2,
align = "h")

Figure 5: Phylogenetic Tree
#### PHYLOGENETIC ANALYSIS
load("data/phyloseq.RData")
physeq.otu
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 13370 taxa and 859 samples ]
## tax_table() Taxonomy Table: [ 13370 taxa by 7 taxonomic ranks ]
# With all of the OTUs that pass the RL score threshold
otu_scores_df <- matrix_RLscores %>%
as.data.frame() %>%
tibble::rownames_to_column("OTU")
# Subset the vector with names of the OTUs
vector_of_otus <- as.vector(otu_scores_df$OTU)
# Write to a file
#write(vector_of_otus, file = "data/fasttree/Figure5/OTUnames_RLscores_258.txt",
# ncolumns = 1, append = FALSE, sep = "\n")
physeq <- physeq.otu %>%
subset_taxa(., taxa_names(physeq.otu) %in% vector_of_otus)
# Which OTU is missing?
setdiff(sort(vector_of_otus), sort(taxa_names(physeq)))
## character(0)
setdiff(sort(taxa_names(physeq)), sort(vector_of_otus))
## character(0)
######################################### FASTTREE
fast_tree <- read.tree(file="data/fasttree/Figure5/newick_tree_OTUs258.tre")
fast_tree_tip_order <- data.frame(fast_tree$tip.label) %>%
rename(OTU = fast_tree.tip.label)
# Make sure the OTUs match
stopifnot(sort(vector_of_otus)==sort(as.character(fast_tree_tip_order$OTU)))
fasttree_physeq <- merge_phyloseq(physeq, phy_tree(fast_tree))
# Fix the taxonomy names
colnames(tax_table(fasttree_physeq)) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
###################################################################### ADD THE PROTEOBACTERIA TO THE PHYLA
phy <- data.frame(tax_table(fasttree_physeq))
Phylum <- as.character(phy$Phylum)
Class <- as.character(phy$Class)
for (i in 1:length(Phylum)){
if (Phylum[i] == "Proteobacteria"){
Phylum[i] <- Class[i] } }
phy$Phylum <- Phylum # Add the new phylum level data back to phy
t <- tax_table(as.matrix(phy))
tax_table(fasttree_physeq) <- t
fasttree_tax <- data.frame(tax_table(fasttree_physeq)) %>%
tibble::rownames_to_column(var = "OTU")
## Let's root the tree
is.rooted(fast_tree)
## [1] FALSE
rooted_tree <- root(fast_tree, outgroup = "Otu001533", resolve.root = TRUE)
is.rooted(rooted_tree) # Sanity Check
## [1] TRUE
# Need to make a manual file based off of FIgure 4 - it starts with OTU names
# write.csv(df_RLscores, file = "data/fasttree/Figure5/OTUs258_MANUAL.csv", row.names = FALSE)
# Next I manually filled in the fcm_type and lake columns :/
phyfcm_fasttree_df <- read.csv("data/fasttree/Figure5/OTUs258_MANUAL.csv") %>%
left_join(., fasttree_tax, by = "OTU") %>%
tibble::column_to_rownames("OTU") %>%
dplyr::select(Phylum, FCM)
rooted_tree_plot <-
ggplot(rooted_tree, aes(x, y)) + geom_tree() + theme_tree() +
geom_tiplab(size=3, align=TRUE, linesize=.5) #+
#geom_nodelab(vjust=-.5, size=3)
gheatmap(rooted_tree_plot, phyfcm_fasttree_df, offset = 0.1, width=0.3, font.size=3, colnames_angle=0, hjust=0.5) +
scale_fill_manual(values = phylum_colors,
breaks = c("Acidobacteria","Actinobacteria", "Alphaproteobacteria", "Bacteria_unclassified", "Bacteroidetes", "Betaproteobacteria",
"Chlorobi", "Chloroflexi","Cyanobacteria","Deltaproteobacteria", "Epsilonproteobacteria", "Firmicutes",
"Gammaproteobacteria","Gemmatimonadetes","Omnitrophica","Parcubacteria","Planctomycetes",
"Proteobacteria_unclassified", "TA18","unknown_unclassified", "Verrucomicrobia",
# FCM Groups
"Both", "HNA", "LNA")) +
guides(fill=guide_legend(ncol=1))
